# This file mainly presents how to get Pathway 4's related analysis results

# Results can be obtained in this file include:

      ## Appendix 4.3: Logistic Regression Results (Pathway 4)
      ## Table 7: Predictive Accuracy Results  (Pathway 4)
      ## Table 8: Variance Inflation Factor Values of Variables (Pathway 4 part)
      ## Table 9: Models’ performance metrics (Pathway 4 part)

## R version: 4.4.2

#-----------------Prepare the libraries and the dataset------------------------#

rm(list = ls())  

# Libraries  
library(readxl)  
library(tidyverse)  
library(fixest)        
library(lme4)          
library(pROC)          
library(ResourceSelection) 
library(car)
library(knitr)

options(scipen = 100)  

# Input the data

PSIdata <- read_excel("~/Desktop/PSI.xlsx") 
PSIdata <- PSIdata %>%  
  mutate(  
    Country = as.factor(Country),  
    Year = as.factor(as.character(Year))
  )  

#--------------Appendix 4.3: Logistic Regression Results (Pathway 4)-----------#

# 1) Create train/test splits -------------------------------------------------  
# text_type==0 means UNPSA text; text_type==1 means synthetic (AI-generated) text

train_data12 <- PSIdata %>% filter(Year == "2018")  
test_data12  <- PSIdata %>% filter(text_type == 0, Year %in% c("2019", "2020", "2021", "2022"))  

train_data13 <- PSIdata %>% filter(Year %in% c("2018", "2019", "2020", "2021"))  
test_data13  <- PSIdata %>% filter(text_type == 0, Year == "2022")  

# predictor vector  

predictors <- c("innovation", "impact", "replicability",  
                "subjectivity", "numeric", "emotionality", "certainty","wordcount")  
pred_formula_rhs <- paste(predictors, collapse = " + ")  


fit_glmer <- function(train_df, model_name) {  
  re_terms <- c()  
  if (n_distinct(train_df$Country) > 1) re_terms <- c(re_terms, "(1|Country)")  
  if (n_distinct(train_df$Year) > 1)    re_terms <- c(re_terms, "(1|Year)")  
  re_part <- if (length(re_terms) > 0) paste("+", paste(re_terms, collapse = " + ")) else ""  
  fmla <- as.formula(paste0("win ~ ", pred_formula_rhs, " ", re_part))  
  message("Fitting GLMM for ", model_name, ": ", deparse(fmla))  
  glmer(fmla, data = train_df, family = binomial(link = "logit"),  
        control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 2e5)))  
}  

# Fit all four GLMM models
model12 <- fit_glmer(train_data12, "Model 12")  
model13 <- fit_glmer(train_data13, "Model 13")  
 

# Extract coefficients from GLMM models
extract_glmm_coefficients <- function(model, model_name) {
  coef_summary <- summary(model)$coefficients
  coef_df <- as.data.frame(coef_summary)
  coef_df$Variable <- rownames(coef_df)
  coef_df$Model <- model_name
  coef_df <- coef_df[, c("Model", "Variable", "Estimate", "z value", "Pr(>|z|)")]
  names(coef_df) <- c("Model", "Variable", "Coefficient", "z_value", "p_value")
  
  coef_df$Significance <- ifelse(coef_df$p_value < 0.001, "***",
                                 ifelse(coef_df$p_value < 0.01, "**",
                                        ifelse(coef_df$p_value < 0.05, "*", "")))
  return(coef_df)
}

# 4) Output results-------------------------------------------------------------

# Model 12 regression results---------------------------------------------------- 
coef_model12 <- extract_glmm_coefficients(model12, "Model 12 (Train: 2018)")
kable(coef_model12[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 12 (UNPSA Winner of 2019-2022)")
cat("N (Train) =", nrow(train_data12), "observations\n")
cat("N (Test) =", nrow(test_data12), "observations\n\n")


# Model 13 regression results---------------------------------------------------
coef_model13 <- extract_glmm_coefficients(model13, "Model 13 (Train: 2018-2021)")
kable(coef_model13[, c("Variable", "Coefficient", "z_value", "p_value", "Significance")],
      digits = 3,
      row.names = FALSE,
      caption = "MODEL 13 (UNPSA Winner of 2022)")
cat("N (Train) =", nrow(train_data13), "observations\n")
cat("N (Test) =", nrow(test_data13), "observations\n\n")


#-------------Table 7: Predictive Accuracy Results  (Pathway 4)---------------#

# 1) calculate the model performance metrics------------------------------------

calculate_pr_auc <- function(actual, probs) {
  if (length(actual) != length(probs)) {
    stop("Not match")
  }
  
  # remove NA value
  valid_idx <- !is.na(actual) & !is.na(probs)
  actual <- actual[valid_idx]
  probs <- probs[valid_idx]
  
  if (sum(actual) == 0) {
    return(0)
  }
  
  # calculate Precision-Recall curve
  pr_curve <- data.frame(
    threshold = seq(0, 1, by = 0.01),
    precision = NA,
    recall = NA
  )
  
  for (i in 1:nrow(pr_curve)) {
    threshold <- pr_curve$threshold[i]
    pred_class <- ifelse(probs >= threshold, 1, 0)
    
    # Calculate the confusion matrix
    tp <- sum(actual == 1 & pred_class == 1)
    tn <- sum(actual == 0 & pred_class == 0)
    fp <- sum(actual == 0 & pred_class == 1)
    fn <- sum(actual == 1 & pred_class == 0)
    
    # calculate Precision and Recall
    precision <- ifelse((tp + fp) > 0, tp / (tp + fp), 1)
    recall <- ifelse((tp + fn) > 0, tp / (tp + fn), 0)
    
    pr_curve$precision[i] <- precision
    pr_curve$recall[i] <- recall
  }
  
  pr_curve <- pr_curve[order(pr_curve$recall, -pr_curve$precision), ]
  pr_curve <- pr_curve[!duplicated(pr_curve$recall), ]
  
  # calculate the PR AUC 
  pr_auc <- 0
  for (i in 2:nrow(pr_curve)) {
    width <- pr_curve$recall[i] - pr_curve$recall[i-1]
    height <- (pr_curve$precision[i] + pr_curve$precision[i-1]) / 2
    pr_auc <- pr_auc + width * height
  }
  
  return(pr_auc)
}

calculate_metrics <- function(actual, predicted, probs) {
  conf_matrix <- table(actual, predicted)
  accuracy <- sum(diag(conf_matrix)) / sum(conf_matrix)
  recall <- conf_matrix[2,2] / sum(conf_matrix[2,])
  specificity <- conf_matrix[1,1] / sum(conf_matrix[1,])
  precision <- conf_matrix[2,2] / sum(conf_matrix[,2])
  f1_score <- 2 * (precision * recall) / (precision + recall)
  
  # ROC AUC
  roc_obj <- roc(actual, probs)
  auc_val <- auc(roc_obj)
  
  # PR AUC
  pr_auc_val <- calculate_pr_auc(actual, probs)
  
  return(list(
    confusion_matrix = conf_matrix,
    accuracy = accuracy,
    recall = recall,
    specificity = specificity,
    precision = precision,
    f1_score = f1_score,
    auc = auc_val,
    pr_auc = pr_auc_val
  ))
}

pred_prob12 <- predict(model12, newdata = test_data12, type = "response", allow.new.levels = TRUE)  
pred_prob13 <- predict(model13, newdata = test_data13, type = "response", allow.new.levels = TRUE)  

pred_class12 <- ifelse(pred_prob12 >= 0.5, 1, 0)  
pred_class13 <- ifelse(pred_prob13 >= 0.5, 1, 0)  
  

# 2) output the predictive accuracy results (in Table 6)------------------------

metrics12 <- calculate_metrics(test_data12$win, pred_class12, pred_prob12)
metrics13 <- calculate_metrics(test_data13$win, pred_class13, pred_prob13)

# Predictive Accuracy Results of Model 12---------------------------------------
print(metrics12) 

# Predictive Accuracy Results of Model 13---------------------------------------
print(metrics13) 


#---------Table 8: Variance Inflation Factor Values of Variables---------------#
# Models 12, 13

round(vif(model12), 3)
round(vif(model13), 3)

#-----------------------Table 9: Models’ performance metrics-------------------#
# Models 12, 13 part

# 1) construct the performance table--------------------------------------------

performance_table <- data.frame(
  Model = c("Model 12", "Model 13"),
  Precision = c(metrics12$precision, metrics13$precision),
  Recall = c(metrics12$recall, metrics13$recall),
  F1_Score = c(metrics12$f1_score, metrics13$f1_score),
  PR_AUC = c(metrics12$pr_auc, metrics13$pr_auc)
)

performance_table[, -1] <- round(performance_table[, -1], 3)

# 2) Output the performance results---------------------------------------------

cat("\nPerformance Metrics Table:\n")
print(performance_table, row.names = FALSE)



